#!/bin/bash

# ------------------------------ Configuration ------------------------------
# Absolute directory containing this script; every path below hangs off it.
# CDPATH is cleared so `cd` can neither jump elsewhere nor echo a path into
# the captured output.
SCRIPT_DIR="$(CDPATH='' cd -- "$(dirname -- "$0")" && pwd)" || {
  echo "[ERROR] Cannot resolve the script directory from $0" >&2
  exit 1
}

# Directory scanned for input *.jsonl dataset files
DATASET_PARENT_DIR="$SCRIPT_DIR/dataset"

# Parent output directory (all results stored here)
OUTPUT_PARENT="$SCRIPT_DIR/eng_content_gen_gptnopersona"

# Python evaluation script (this directory)
SCRIPT_PATH="$SCRIPT_DIR/eng_nopersona_gpt.py"

# GPUs to use – edit as needed
GPUS=(0 1 2 3 4 5 6 7)
# NOTE(review): MAX_JOBS_PER_GPU is not referenced anywhere in this script;
# slices are spread round-robin over GPUS, so TOTAL_JOBS alone decides how
# many workers end up sharing a GPU.
MAX_JOBS_PER_GPU=1
# Upper bound on the number of slices (and concurrent workers) per dataset
TOTAL_JOBS=30

# Worker logs are redirected into logs/, so stop here if it cannot be created
# instead of failing later on every single launch.
mkdir -p -- "$OUTPUT_PARENT" "$OUTPUT_PARENT/logs" || {
  echo "[ERROR] Cannot create output directory $OUTPUT_PARENT" >&2
  exit 1
}

# ---------------------------------------------------------------------------
# Helper: wait for background slice jobs and report the ones that failed
#   $1    : dataset name used in the warning messages
#   $2... : jobs encoded as "<pid>:<start>-<end>"
# Returns 0 when every job exited successfully, 1 otherwise.
_wait_for_slices() {
  local dataset="$1"
  shift
  local job status=0
  for job in "$@"; do
    # ${job%%:*} is the PID, ${job#*:} the slice range it was given
    if ! wait "${job%%:*}"; then
      echo "[WARN] $dataset: slice ${job#*:} exited with an error (see its log)" >&2
      status=1
    fi
  done
  return "$status"
}

# ---------------------------------------------------------------------------
# Helper: launch evaluation jobs for a single dataset file
#   $1 : absolute path to dataset .jsonl file
# Globals read: OUTPUT_PARENT, SCRIPT_PATH, GPUS, TOTAL_JOBS
# Splits the dataset into at most TOTAL_JOBS contiguous line ranges, runs one
# background worker per range (GPUs assigned round-robin), waits for them and
# merges the per-slice JSON results into
# "$OUTPUT_PARENT/<safe basename>_merged.json".
# Returns non-zero if the input is unusable, the merge step fails, or any
# worker exited with an error (the merge is still attempted in that case).
launch_dataset() {
  local DATA_PATH="$1"
  if [ ! -r "$DATA_PATH" ]; then
    echo "[ERROR] Cannot read dataset: $DATA_PATH" >&2
    return 1
  fi
  # Both values are used as divisors below
  if (( ${#GPUS[@]} == 0 || TOTAL_JOBS < 1 )); then
    echo "[ERROR] GPUS must be non-empty and TOTAL_JOBS must be >= 1" >&2
    return 1
  fi

  local BASENAME
  BASENAME=$(basename -- "$DATA_PATH")
  # Remove spaces and parentheses from the basename for folder names
  local SAFE_BASENAME
  SAFE_BASENAME=$(printf '%s' "$BASENAME" | tr -d ' ()')
  local OUT_DIR="$OUTPUT_PARENT/$SAFE_BASENAME"
  mkdir -p -- "$OUT_DIR"

  # grep -c '' also counts a final line without a trailing newline, which
  # `wc -l` would miss (dropping the last record). Its exit status 1 merely
  # means "zero lines" and is deliberately not treated as an error.
  local TOTAL_LINES
  TOTAL_LINES=$(LC_ALL=C grep -c '' < "$DATA_PATH")
  echo "[INFO] Dataset: $BASENAME  (lines=$TOTAL_LINES)"

  # Determine chunk size so that we run at most TOTAL_JOBS concurrent workers
  local LINES_PER_JOB=$(( (TOTAL_LINES + TOTAL_JOBS - 1) / TOTAL_JOBS ))
  echo "[INFO] Splitting into chunks of $LINES_PER_JOB lines across $TOTAL_JOBS jobs"

  local job_counter=0 start_idx=0 any_failed=0
  local end_idx gpu_id log_file
  local -a jobs=()
  while [ "$start_idx" -lt "$TOTAL_LINES" ]; do
    # Slice bounds are 0-based and inclusive; clamp the last one to the file end
    end_idx=$((start_idx + LINES_PER_JOB - 1))
    if [ "$end_idx" -ge "$TOTAL_LINES" ]; then
      end_idx=$((TOTAL_LINES - 1))
    fi

    # Round-robin GPU assignment
    gpu_id=${GPUS[$((job_counter % ${#GPUS[@]}))]}
    log_file="$OUTPUT_PARENT/logs/${SAFE_BASENAME}_${start_idx}_${end_idx}_gpu${gpu_id}.log"

    echo "    Launching slice $start_idx-$end_idx on GPU $gpu_id"
    # --gpu_id stays 0: presumably because CUDA_VISIBLE_DEVICES exposes only
    # the chosen GPU to the worker, which then sees it as device 0.
    CUDA_VISIBLE_DEVICES=$gpu_id nohup python -u "$SCRIPT_PATH" \
      --tweet_eval \
      --dataset_paths "$DATA_PATH" \
      --start "$start_idx" \
      --end "$end_idx" \
      --output_dir "$OUT_DIR" \
      --gpu_id 0 \
      > "$log_file" 2>&1 &
    jobs+=("$!:$start_idx-$end_idx")

    job_counter=$((job_counter + 1))
    start_idx=$((end_idx + 1))

    # If we've launched TOTAL_JOBS concurrent tasks, wait for them to finish
    if [ $((job_counter % TOTAL_JOBS)) -eq 0 ]; then
      echo "[INFO] Waiting for current batch of jobs to finish..."
      _wait_for_slices "$BASENAME" "${jobs[@]}" || any_failed=1
      jobs=()
      echo "[INFO] Resuming launches..."
    fi
  done

  # Wait for remaining background jobs
  _wait_for_slices "$BASENAME" "${jobs[@]}" || any_failed=1

  # ---------------- Merge partial outputs ----------------
  # Paths are passed as arguments and the heredoc is quoted, so no shell value
  # is ever pasted into the Python source.
  echo "[INFO] Merging partial outputs for $BASENAME ..."
  local merge_status=0
  python - "$OUT_DIR" "$OUTPUT_PARENT" "$SAFE_BASENAME" <<'PY' || merge_status=$?
import glob, json, os, re, sys

out_dir, parent_dir, safe_basename = sys.argv[1:4]


def natural_key(path):
    # Compare digit runs numerically so "_100_" sorts after "_20_"; a plain
    # string sort would interleave slices out of dataset order.
    parts = re.split(r"(\d+)", os.path.basename(path))
    return [int(p) if i % 2 else p for i, p in enumerate(parts)]


# Escape the directory so characters such as "[" in its name stay literal.
pattern = os.path.join(glob.escape(out_dir), "gmo_results_*_*.json")
merged = []
for fp in sorted(glob.glob(pattern), key=natural_key):
    try:
        with open(fp, 'r') as f:
            merged.extend(json.load(f))
    except Exception as e:
        print(f"[WARN] Could not read {fp}: {e}", file=sys.stderr)

merged_path = os.path.join(parent_dir, f"{safe_basename}_merged.json")
with open(merged_path, 'w') as f:
    json.dump(merged, f, indent=2)
print(f"[INFO] -> Merged file written: {merged_path}  (records={len(merged)})")
PY

  if [ "$merge_status" -ne 0 ]; then
    return "$merge_status"
  fi
  return "$any_failed"
}

# ------------------------------ Main ----------------------------------------
# Fail fast: without the worker script or an interpreter every background job
# would die immediately and the merges would silently produce empty files.
if [ ! -f "$SCRIPT_PATH" ]; then
  echo "[ERROR] Evaluation script not found: $SCRIPT_PATH" >&2
  exit 1
fi
if ! command -v python > /dev/null 2>&1; then
  echo "[ERROR] 'python' was not found on PATH" >&2
  exit 1
fi

# nullglob makes an unmatched pattern expand to an empty array rather than to
# the literal pattern string.
shopt -s nullglob
DATASETS=("$DATASET_PARENT_DIR"/*.jsonl)

if [ "${#DATASETS[@]}" -eq 0 ]; then
  echo "[ERROR] No .jsonl datasets found in $DATASET_PARENT_DIR" >&2
  exit 1
fi

# Datasets are processed one after another; each call blocks until its own
# background jobs are done and its per-dataset merge has been written.
for d in "${DATASETS[@]}"; do
  if ! launch_dataset "$d"; then
    echo "[WARN] Dataset $(basename "$d") finished with errors" >&2
  fi
  echo "[INFO] Finished dataset $(basename "$d")"
  echo "-------------------------------------------------------------"
done

# ---------------- Merge all week-level merged JSONs into one ----------------
# The output directory is passed as an argument and the heredoc is quoted, so
# the path is never pasted into the Python source.
echo "[INFO] Creating final merge.json across all weeks ..."
python - "$OUTPUT_PARENT" <<'PY'
import glob, json, os, sys

parent_dir = sys.argv[1]
# Escape the directory so characters such as "[" in its name stay literal.
# Every *_merged.json present in the directory is included, not only the ones
# written during this run.
merged_files = sorted(glob.glob(os.path.join(glob.escape(parent_dir), "*_merged.json")))
grand = []
for fp in merged_files:
    try:
        with open(fp, 'r') as f:
            grand.extend(json.load(f))
    except Exception as e:
        print(f"[WARN] Could not read {fp}: {e}", file=sys.stderr)

final_path = os.path.join(parent_dir, "merge.json")
with open(final_path, 'w') as f:
    json.dump(grand, f, indent=2)
print(f"[INFO] -> Final merged file written: {final_path}  (records={len(grand)})")
PY

echo "[INFO] All datasets processed. Individual and final merged outputs are in $OUTPUT_PARENT"
